## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.1.0 ✓ dplyr 1.0.4
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
Calendar heatmaps are often used to discern patterns, trends, and anomalies over time in a calendar-like interface. In this case, they can be used to understand how AirBnB occupancies and revenue changed over the period from 2019 to 2020.
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
We can see that total occupancies in 2019 were generally at least twice that of occupancies in 2020.
## Loading required package: lattice
## Loading required package: grid
## Loading required package: chron
##
## Attaching package: 'chron'
## The following objects are masked from 'package:lubridate':
##
## days, hours, minutes, seconds, years
Occupancy rates are severely reduced in 2020.
Total daily revenue decreased from about 6 million dollars a day to 2 or 3 million dollars a day.
Average daily revenues are also lower in 2020.
We can view the same patterns using line graphs instead.
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
The data tables allow users to see the number of bookings, listings, percentage of listings booked, total revenue, and average revenue for any day that they select.
The patterns in the reviews are similar to those in the occupancy and revenue trends.
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
On average, we see that only 1.5% of guests leave reviews.
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
We were interested to know what makes a host a superhost. We believe that response and acceptance rates could be linked closely to it.
## Bar chart of `avgResponse` for each `host_is_superhost` group taken from
## `superhost_summary`; bars are filled by the same grouping variable.
ggplot(
  superhost_summary,
  aes(x = host_is_superhost, y = avgResponse, fill = host_is_superhost)
) +
  geom_col(width = 0.5) +
  labs(
    title = "Average Response Rates of Superhosts and Non-Superhosts",
    y = "Response Rates (%)"
  ) +
  theme(
    ## the fill colour repeats the x axis, so neither a legend nor an
    ## x-axis title is shown
    axis.title.x = element_blank(),
    legend.position = "none",
    plot.title = element_text(hjust = 0.5)
  )
library(tidyverse)
###/Users/armaanahmed/Desktop/listings.csv
##/Users/armaanahmed/Desktop/reviews.csv
##/Users/armaanahmed/Desktop/calendar.csv
##install.packages("textdata")
## Read the review and listing tables from disk.
reviews <- read.csv(
  file = "/Users/armaanahmed/Desktop/Data\ Viz\ AirBNB\ Data/Su/data2/reviews.csv"
)
listings <- read.csv(
  file = "/Users/armaanahmed/Desktop/Data\ Viz\ AirBNB\ Data/Su/data2/listings.csv"
)
## Attach every review to its listing (listings$id == reviews$listing_id) and
## keep only the reviews dated in 2019 or 2020.
## NOTE(review): the bounds are compared against `date` as given; this
## presumably relies on `date` being ISO-formatted (YYYY-MM-DD) text -- confirm.
airbnb <- listings %>%
  inner_join(reviews, by = c("id" = "listing_id")) %>%
  filter(date > "2018-12-31", date < "2021-01-01")
## How many properties does a host own?
## Step 1: collapse the review rows to one row per (host_id, id) pair.
## Step 2: count those rows per host and list the largest hosts first.
## Only listings that appear in `airbnb` (i.e. that have joined review rows)
## are counted.
airbnb2 <- count(airbnb, host_id, id) %>%
  group_by(host_id) %>%
  count() %>%
  arrange(desc(n))
## Distribution of the number of properties per host
table(airbnb2$n)
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13
## 12132 1546 413 195 89 48 34 31 18 7 8 6 2
## 14 15 16 17 18 20 21 22 23 24 26 29 30
## 5 1 4 1 2 1 3 2 2 1 2 3 1
## 31 32 34 35 36 37 40 78 91 98
## 1 2 1 1 1 1 2 1 1 1
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(quanteda)
## Package version: 3.0.0
## Unicode version: 10.0
## ICU version: 61.1
## Parallel computing: 6 of 6 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:tm':
##
## stopwords
## The following objects are masked from 'package:NLP':
##
## meta, meta<-
## Keep only reviews whose text contains no character outside \x01-\x7F,
## used here as a stand-in for "English".
## NOTE(review): this also discards English reviews containing emoji or
## accented characters, and keeps non-English text written in plain ASCII.
airbnb3 <- airbnb[which(!grepl("[^\x01-\x7F]+", airbnb$comments)),]
## Lower-case FIRST: removeWords() matches case-sensitively, so stop words
## written with a capital ("The", "And", "It") would otherwise survive the
## stop-word removal below (assuming the stop-word lists are lower-case).
airbnb3$comments <- tolower(airbnb3$comments)
## Remove stop words while apostrophes are still in the text, so contractions
## such as "don't" can still match their stop-word entries.
airbnb3$comments <- removeWords(airbnb3$comments, stopwords(language = "en", source = "stopwords-iso"))
airbnb3$comments <- removeWords(airbnb3$comments, stopwords(language = "en", source = "marimo"))
## Remove numbers and punctuation
airbnb3$comments <- removeNumbers(airbnb3$comments)
airbnb3$comments <- removePunctuation(airbnb3$comments)
## Collapse whitespace LAST so the gaps left by the removals above are
## squeezed as well.
airbnb3$comments <- stripWhitespace(airbnb3$comments)
##install.packages("tidytext")
library(tidytext)
## One row per word of every cleaned review, minus the words listed in
## tidytext's `stop_words`.
tidy_ab <- unnest_tokens(airbnb3, output = word, input = comments) %>%
  anti_join(stop_words, by = "word")
## Attach the AFINN score (column `value`) to each word; words that are not in
## the lexicon are dropped by the inner join.
afinn <- get_sentiments("afinn")
tidy_ab_sent <- inner_join(tidy_ab, afinn, by = "word")
## Mean word score per (host_id, reviewer_id) pair, repeated on every word row.
sent_by_rev <- tidy_ab_sent %>%
  group_by(host_id, reviewer_id) %>%
  mutate(rev_sent = mean(value))
## `price` is stored as text, so summary() on the raw column only reports its
## length and class. Strip the currency symbol and thousands separators and
## convert to numeric so that real quartiles are printed.
## NOTE(review): presumes prices look like "$1,234.00" -- confirm the format.
summary(as.numeric(gsub("[$,]", "", tidy_ab_sent$price)))
## Length Class Mode
## 652281 character character
## 75% of the properties are cheaper than $145 per night
## Mean AFINN score of all scored words that share a (host_id, reviewer_id)
## pair, stored on every word row as `rev_sent`.
tidy_ab_sent <- tidy_ab_sent %>%
  group_by(host_id, reviewer_id) %>%
  mutate(rev_sent = mean(value)) %>%
  ungroup()  # do not leave the grouping attached for the steps that follow
## Bin the mean score into unit-wide intervals from -5 to 5.
## include.lowest = TRUE closes the first interval on the left, so a mean of
## exactly -5 lands in the first bin instead of becoming NA (which table()
## would silently drop).
tidy_ab_sent$sentiment_fac <- cut(tidy_ab_sent$rev_sent, breaks = -5:5,
                                  include.lowest = TRUE)
table(tidy_ab_sent$sentiment_fac)
##
## (-5,-4] (-4,-3] (-3,-2] (-2,-1] (-1,0] (0,1] (1,2] (2,3] (3,4] (4,5]
## 18 638 2334 5194 13835 49593 230525 318297 31337 509
## Label every scored word "good" (AFINN value above 0) or "bad" (otherwise).
## NOTE(review): the label is derived from the per-word `value`, not from the
## per-review mean `rev_sent`.
tidy_ab_sent$sentiment <- ifelse(tidy_ab_sent$value > 0, "good", "bad")
## Box plot of nightly price per neighbourhood group, split by sentiment label,
## for listings under $200.
## `price` is stored as text: comparing it with 200 would be a string
## comparison and the y axis would be discrete. It is therefore converted to
## numeric inside the pipeline first (the stored column is left unchanged).
## NOTE(review): presumes prices look like "$1,234.00" -- confirm the format.
tidy_ab_sent %>%
  ungroup() %>%
  mutate(price = as.numeric(gsub("[$,]", "", price))) %>%
  filter(price < 200) %>%
  ggplot(aes(x = neighbourhood_group_cleansed, y = price, color = sentiment)) +
  geom_boxplot() +
  labs(title = "Neighborhood groups and Price by sentiment",
       x = "Neighborhood Group", y = "Price per night($)")
## Frequency of every (word, sentiment) combination, most frequent first.
## `.groups = "drop"` returns an ungrouped table: the default would keep the
## result grouped by `word` (and print a message about it), a grouping that
## none of the later steps needs.
tidy_ab_combined <- tidy_ab_sent %>%
  group_by(word, sentiment) %>%
  summarise(count = n(), .groups = "drop") %>%
  arrange(desc(count))
## `summarise()` has grouped output by 'word'. You can override using the `.groups` argument.
## Document-term matrix with one "document" per sentiment label, one term per
## word, and the word count as the cell value.
ab_dtm <- cast_dtm(
  tidy_ab_combined,
  document = sentiment,
  term = word,
  value = count
)
ab_dtm
## <<DocumentTermMatrix (documents: 2, terms: 1528)>>
## Non-/sparse entries: 1528/1528
## Sparsity : 50%
## Maximal term length: 17
## Weighting : term frequency (tf)
## Dense copy of the DTM (documents in rows) ...
ab_m <- as.matrix(ab_dtm)
## ... and its transpose (terms in rows, one column per sentiment label)
ab_tm <- t(ab_m)
What are the key words that are found in good comments?
library(wordcloud)
## Loading required package: RColorBrewer
## success
## Word cloud of the words labelled "good", sized by their count and drawn in
## blue; at most 100 words, each occurring at least 3 times.
tidy_ab_combined %>%
  filter(sentiment == "good") %>%
  with(
    wordcloud(
      words = word,
      freq = count,
      scale = c(4, .5),
      min.freq = 3,
      max.words = 100,
      random.order = FALSE,
      rot.per = .5,
      colors = "blue"
    )
  )
Words like clean, nice, and recommend all come up! It seems that cleanliness, aesthetics, and social cues (like a recommendation) are the most important aspects of a good review.
What are the key words that are found in bad comments?
## success
## Word cloud of the words labelled "bad", sized by their count and drawn in
## red; at most 100 words, each occurring at least 3 times.
tidy_ab_combined %>%
  filter(sentiment == "bad") %>%
  with(
    wordcloud(
      words = word,
      freq = count,
      scale = c(4, .5),
      min.freq = 3,
      max.words = 100,
      random.order = FALSE,
      rot.per = .5,
      colors = "red"
    )
  )
Words like noisy, bad, dirty, block, hard, and stops come up in bad reviews! People want a nice, quiet, clean place to stay.
## Comparison cloud contrasting the "good" and "bad" vocabularies.
## comparison.cloud() hands out `colors` by column position, and the column
## order of `ab_tm` follows the data (presumably whichever label occurs first
## -- confirm). Selecting the columns by name keeps "good" blue and "bad" red,
## matching the two single-sentiment clouds, whatever the data order is.
sentiment_colors <- c(good = "blue", bad = "red")
sentiment_cols <- intersect(names(sentiment_colors), colnames(ab_tm))
comparison.cloud(ab_tm[, sentiment_cols, drop = FALSE],
                 colors = unname(sentiment_colors[sentiment_cols]),
                 scale = c(3.6, .5), random.order = FALSE, rot.per = .5,
                 title.size = 1, max.words = 100)
library(readxl)
library(ggplot2)
library(ggthemes)
library(dplyr)
library(maps)
library(tidyverse)
library(tmap)
library(ggmap)
library(hablar)
library(maps)
library(tidyverse)
library(ggmap)
library(rgdal)
library(data.table)
library(devtools)
library(leaflet)
library(geojsonio)
library(readr)
library(RgoogleMaps)
## Re-read the raw tables with readr; this replaces the `reviews`, `listings`
## and `airbnb` objects created earlier in the script.
reviews <- read_csv(
  file = "/Users/armaanahmed/Desktop/untitled\ folder\ 2/reviews.csv"
)
calendar <- read_csv(
  file = "/Users/armaanahmed/Desktop/untitled\ folder\ 2/calendar.csv"
)
listings <- read_csv(
  file = "/Users/armaanahmed/Desktop/untitled\ folder\ 2/listings.csv"
)
airbnb <- read_csv(
  file = "/Users/armaanahmed/Documents/GitHub/Group_O_Airbnb/AB_US_2020.csv"
)
## Keep only the New York City rows (rows with a missing city are dropped)
airbnb <- airbnb[airbnb$city %in% "New York City", ]
##Get rid of unnecessary data in park dataset
## Base layer: "toner-background" tiles from the "stamen" source for the
## location "New York City" at zoom level 12
map_TS_st1 <- get_map(
  location = "New York City",
  zoom = 12,
  source = "stamen",
  maptype = "toner-background"
)
ggmap_TS_st1 <- ggmap(map_TS_st1)
ggmap_TS_st1
## Overlay one blue point per row of `airbnb` at its longitude/latitude
map2 <- ggmap_TS_st1 +
  geom_point(
    data = airbnb,
    mapping = aes(x = longitude, y = latitude),
    color = "blue",
    alpha = 0.9,
    size = 1
  )
map2
airbnbdt <- as.data.table(airbnb)
## Bucket the nightly price into five labelled ranges. Every interval is
## closed on the left: [0,100) "$99 Bargain", [100,200) "A steal",
## [200,300) "Kinda Pricey", [300,400) "Expensive", [400,Inf) "Ultra-expensive?".
## cut() builds the whole column in one step. The earlier chain of
## `airbnb$pricerange[cond] <- label` assignments left a price of exactly 400
## unlabelled (it is neither > 400 nor < 400) and depended on partially
## filling a column that did not exist yet.
price_breaks <- c(-Inf, 100, 200, 300, 400, Inf)
price_labels <- c("$99 Bargain", "A steal", "Kinda Pricey", "Expensive",
                  "Ultra-expensive?")
## as.character() keeps the column a plain character vector, as before
airbnb$pricerange <- as.character(
  cut(airbnb$price, breaks = price_breaks, labels = price_labels,
      right = FALSE)
)
## One colour per price range, taken from the "Set1" palette
library(RColorBrewer)
pal <- colorFactor("Set1", domain = airbnb$pricerange)
## Colour of each listing, in row order
color_offsel1 <- pal(airbnb$pricerange)
## HTML shown when a listing is clicked: a headline followed by the price,
## number of reviews and room type, each on its own line. One string is built
## per row of `airbnb`; the pieces are joined with single spaces.
content <- paste(
  "Check this AirBNB out!!", "<br/>",
  "Price:", airbnb$price, "<br/>",
  "Number of Reviews:", airbnb$number_of_reviews, "<br/>",
  "Type of Room:", airbnb$room_type, "<br/>",
  sep = " "
)
## Interactive map: one circle per listing, coloured by price range, with the
## HTML popup attached. Zooming is limited to levels 12-18 and the initial
## view is set to lng -73.96 / lat 40.78 at zoom 14.
interactiveairbnbmap <- leaflet(
  airbnb,
  options = leafletOptions(minZoom = 12, maxZoom = 18)
) %>%
  addTiles() %>%  # default base tiles
  addCircles(
    lng = ~longitude,
    lat = ~latitude,
    color = color_offsel1,
    popup = content
  ) %>%
  addProviderTiles("NASAGIBS.ViirsEarthAtNight2012") %>%
  setView(lng = -73.96, lat = 40.78, zoom = 14)
interactiveairbnbmap
## Clustered variant of the listing map: circle markers are grouped via
## markerClusterOptions(), and a legend explains the price-range colours.
clusteredmap <- leaflet(airbnb, options = leafletOptions()) %>%
  addTiles() %>%  # default OpenStreetMap tiles
  addCircleMarkers(
    lng = ~longitude,
    lat = ~latitude,
    color = color_offsel1,
    popup = content,
    clusterOptions = markerClusterOptions()
  ) %>%
  setView(lng = -73.96, lat = 40.78, zoom = 14) %>%
  addLegend(
    pal = pal,
    values = airbnb$pricerange,
    title = "AirBNB's in New York City <br/> Check it out!"
  ) %>%
  addProviderTiles("NASAGIBS.ViirsEarthAtNight2012")
clusteredmap